#-*-coding:utf-8;-*-
#from jieba_hotword import *
from gensim.models import Phrases
from gensim.models import phrases
from gensim import corpora
from gensim.models import ldamodel
import re
#try to seek new phrases

"""
for i in L:
    df[i]["分词1"]=df[i]["分词"].apply(lambda x: x.split(" "))
    corpus+=list(df[i]["分词1"])
"""
# Score threshold handed to the bigram Phrases model in getPhrases.
THRESHOLD2 = 30
# Score threshold for the trigram model: one third of the bigram threshold
# (integer division).
THRESHOLD3 = THRESHOLD2 // 3
# min_count handed to the bigram Phrases model; the trigram model uses half of it.
MINCOUNT = 2
def getPhrases(corpus):
    """
    Detect multi-word phrases in a tokenised corpus with gensim ``Phrases``.

    param=list of list of words cut (one inner list of tokens per document);
          it is traversed more than once, so it must be re-iterable
    return=(list of list of bigrams, list of list of trigrams); one inner
           list per document, each phrase written with a single space where
           the merged token had a "_"
    """
    # First pass: learn two-word collocations and merge them in every document.
    #threshold (float) – The minimum score for a bigram to be taken into account.
    bigram = Phrases(corpus, min_count=MINCOUNT, threshold=THRESHOLD2)
    bigram_mod = phrases.Phraser(bigram)
    data_words_bigrams = [bigram_mod[doc] for doc in corpus]

    # Second pass: train on the bigram-merged documents so the model actually
    # sees "a_b" tokens and can join them with a neighbouring word. Training
    # on the raw corpus (as before) leaves it unable to score "a_b" + "c".
    # max(1, ...) keeps min_count at least 1 even if MINCOUNT is lowered to 1.
    trigram = Phrases(
        data_words_bigrams,
        min_count=max(1, MINCOUNT // 2),
        threshold=THRESHOLD3,
    )
    trigram_mod = phrases.Phraser(trigram)
    data_words_trigrams = [trigram_mod[doc] for doc in data_words_bigrams]

    # Extract the merged phrases. A token counts as a phrase when it contains
    # the "_" pattern, so input words that already contain "_" are picked up too.
    all_bigrams = [
        [token.replace("_", " ") for token in doc if re.match(r".+_", token)]
        for doc in data_words_bigrams
    ]
    # Read from the trigram-transformed documents: the bigram-only documents
    # (used here previously) never hold tokens with two "_", which made this
    # list always empty.
    all_trigrams = [
        [token.replace("_", " ") for token in doc if re.match(r".+_.+_", token)]
        for doc in data_words_trigrams
    ]
    return all_bigrams, all_trigrams
    
    


